In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
Classification¶
In [2]:
# Load the classification dataset (path relative to the notebook location)
df_class = pd.read_csv("../data/classification.csv")
In [3]:
# Dataset dimensions: (rows, columns)
df_class.shape
Out[3]:
(1500, 16)
In [4]:
# Column dtypes: predictors X1-X15 are float64, the class label Y is int64
df_class.dtypes
Out[4]:
X1 float64 X2 float64 X3 float64 X4 float64 X5 float64 X6 float64 X7 float64 X8 float64 X9 float64 X10 float64 X11 float64 X12 float64 X13 float64 X14 float64 X15 float64 Y int64 dtype: object
In [5]:
# First rows, to eyeball value ranges and column layout
df_class.head()
Out[5]:
| X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | X11 | X12 | X13 | X14 | X15 | Y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -2.388741 | 6.221087 | 3.442447 | 1.273807 | 0.912272 | 8.908027 | 8.441999 | -2.607863 | -7.539959 | 5.810530 | -6.960356 | -2.460638 | -9.276179 | -4.184623 | -8.896234 | 2 |
| 1 | -6.012792 | -9.884413 | -1.590610 | 4.999943 | 0.247758 | -1.197048 | -10.939272 | 1.533927 | 7.638279 | 6.033323 | 6.045052 | -7.453784 | 8.672267 | 7.474841 | -6.603927 | 0 |
| 2 | 2.270829 | -8.849332 | -6.619179 | -2.861520 | -6.720253 | 5.715418 | 6.493857 | -4.429523 | -3.821490 | -6.438497 | -8.818146 | 3.422536 | 3.376605 | -8.850959 | -0.717626 | 1 |
| 3 | -7.092421 | -10.254081 | -0.907321 | 3.712683 | -0.567676 | 0.254027 | -10.135377 | -0.412888 | 8.421732 | 6.706882 | 4.189576 | -7.909424 | 8.303048 | 8.711827 | -6.575260 | 0 |
| 4 | -2.246293 | 7.617936 | 3.580218 | 2.412760 | 3.881735 | 8.096439 | 8.372886 | -4.655989 | -4.807351 | 5.980022 | -9.098483 | -2.709957 | -9.063194 | -4.737577 | -8.630724 | 2 |
In [6]:
# DataFrame summary: non-null counts per column, dtypes and memory usage
df_class.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1500 entries, 0 to 1499 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 X1 1500 non-null float64 1 X2 1497 non-null float64 2 X3 1500 non-null float64 3 X4 1497 non-null float64 4 X5 1499 non-null float64 5 X6 1500 non-null float64 6 X7 1499 non-null float64 7 X8 1499 non-null float64 8 X9 1500 non-null float64 9 X10 1498 non-null float64 10 X11 1498 non-null float64 11 X12 1499 non-null float64 12 X13 1499 non-null float64 13 X14 1500 non-null float64 14 X15 1500 non-null float64 15 Y 1500 non-null int64 dtypes: float64(15), int64(1) memory usage: 187.6 KB
In [7]:
# Descriptive statistics (count, mean, std, quartiles) for all numeric columns
df_class.describe()
Out[7]:
| X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | X11 | X12 | X13 | X14 | X15 | Y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1500.000000 | 1497.000000 | 1500.000000 | 1497.000000 | 1499.000000 | 1500.000000 | 1499.000000 | 1499.000000 | 1500.000000 | 1498.000000 | 1498.000000 | 1499.000000 | 1499.000000 | 1500.000000 | 1500.000000 | 1500.000000 |
| mean | -1.883910 | -4.075099 | -1.382158 | 0.920043 | -2.471415 | 4.634421 | 2.112540 | -2.174120 | 0.030046 | 1.693476 | -4.200176 | -2.092793 | 0.364311 | -1.789200 | -5.253161 | 1.000000 |
| std | 3.834396 | 7.897076 | 5.051711 | 3.539492 | 3.470882 | 3.955879 | 8.510537 | 1.729567 | 6.403340 | 6.744790 | 6.244312 | 5.019266 | 6.929017 | 6.713716 | 3.604286 | 0.816769 |
| min | -9.028149 | -13.311122 | -10.556390 | -6.713349 | -10.050322 | -3.461197 | -12.871957 | -5.879455 | -8.092116 | -10.879760 | -11.958360 | -11.674289 | -11.888570 | -11.886751 | -11.533303 | 0.000000 |
| 25% | -5.695571 | -9.919480 | -7.157683 | -2.919740 | -6.301859 | 0.371108 | -8.982112 | -3.523258 | -4.819111 | -7.075972 | -8.866134 | -7.156965 | -8.045253 | -8.207881 | -8.090071 | 0.000000 |
| 50% | -1.986221 | -8.899808 | -0.788744 | 2.032352 | -1.602947 | 5.129056 | 6.295880 | -2.547017 | -3.543957 | 5.560343 | -7.848457 | -2.642599 | 1.913803 | -3.520108 | -6.762037 | 1.000000 |
| 75% | 1.998095 | 6.259801 | 3.629647 | 3.778093 | 0.507027 | 8.334606 | 9.074141 | -0.788418 | 8.205810 | 6.782746 | 3.801478 | 3.464049 | 7.215220 | 6.430320 | -1.055850 | 2.000000 |
| max | 6.040819 | 11.162159 | 7.478247 | 7.514823 | 3.915434 | 12.017239 | 12.307936 | 3.351869 | 12.243760 | 9.963494 | 7.883623 | 7.369420 | 10.548179 | 10.408096 | 2.208405 | 2.000000 |
In [8]:
# Per-column missing-value summary: absolute count and percentage of rows
missing_summary = df_class.isnull().sum().rename("n_missing").to_frame()
missing_summary["pct_missing"] = missing_summary["n_missing"] / len(df_class) * 100
print("\nResumen de valores faltantes:")
missing_summary
Resumen de valores faltantes:
Out[8]:
| n_missing | pct_missing | |
|---|---|---|
| X1 | 0 | 0.000000 |
| X2 | 3 | 0.200000 |
| X3 | 0 | 0.000000 |
| X4 | 3 | 0.200000 |
| X5 | 1 | 0.066667 |
| X6 | 0 | 0.000000 |
| X7 | 1 | 0.066667 |
| X8 | 1 | 0.066667 |
| X9 | 0 | 0.000000 |
| X10 | 2 | 0.133333 |
| X11 | 2 | 0.133333 |
| X12 | 1 | 0.066667 |
| X13 | 1 | 0.066667 |
| X14 | 0 | 0.000000 |
| X15 | 0 | 0.000000 |
| Y | 0 | 0.000000 |
In [9]:
# Row/column count before dropping rows with missing values
df_class.shape
Out[9]:
(1500, 16)
In [10]:
# Drop every row with at least one missing value (1500 -> 1485 rows).
# NOTE(review): this overwrites df_class in place, so the raw frame is only
# recoverable by re-running the load cell.
df_class = df_class.dropna()
In [11]:
# Row/column count after dropping rows with missing values
df_class.shape
Out[11]:
(1485, 16)
In [12]:
# Univariate distributions of every numeric variable (one histogram per column)
num_cols_class = df_class.select_dtypes(include=[np.number]).columns
df_class.loc[:, num_cols_class].hist(bins=20, figsize=(10, 8))
plt.tight_layout()
plt.show()
In [13]:
# Scatter of the class label against each predictor, one figure per column
feature_cols = [c for c in num_cols_class.to_list() if c != "Y"]
for col in feature_cols:
    fig, ax = plt.subplots()
    ax.scatter(df_class[col], df_class["Y"], alpha=0.6)
    ax.set_title(f"Y vs {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Y")
    plt.show()
In [14]:
# Predictor columns (everything except the class label Y)
numeric_cols_class = num_cols_class.drop("Y")

# Pairwise scatter matrix colored by class, drawn with seaborn.
# (Removed: an unused `from pandas.plotting import scatter_matrix` import and
# an unused `class_vals` variable — the plot is produced by sns.pairplot.)
import seaborn as sns

sns.pairplot(
    df_class[numeric_cols_class.tolist() + ["Y"]],
    hue="Y",
    diag_kind="hist",
    palette="Set1",
    plot_kws={"alpha": 0.6, "marker": "o"},
)
plt.suptitle("Pairplot de Variables Numéricas coloreado por Class", y=1.02, fontsize=16)
plt.show()
In [15]:
# Correlation heatmap of predictors plus the class label Y.
# (Removed redundant re-imports: np and plt come from the first cell, sns from
# the pairplot cell above.)
corr = df_class[numeric_cols_class.tolist() + ["Y"]].corr()
sns.set_theme(style="white")

# Mask the upper triangle so every pair is annotated only once
mask = np.triu(np.ones_like(corr, dtype=bool))

# Scale the figure with the number of variables so cells stay readable
n = corr.shape[0]
fig, ax = plt.subplots(figsize=(1.2 * n, 1.2 * n))
sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap="RdBu_r",
    center=0,
    linewidths=1.0,         # thicker separators between cells
    square=True,            # square cells
    annot_kws={"size": 8},  # smaller annotation text
    cbar_kws={"shrink": 0.6, "label": "Correlación"},
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=10)
ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=10)
plt.subplots_adjust(bottom=0.3, left=0.2, top=0.9)
ax.set_title("Mapa de Calor de Correlaciones", fontsize=18, pad=20)
plt.show()
Regression¶
In [17]:
# Load the regression dataset (path relative to the notebook location)
df_reg = pd.read_csv("../data/regression.csv")
In [18]:
# Dataset dimensions: (rows, columns)
df_reg.shape
Out[18]:
(1500, 13)
In [19]:
# Column dtypes: predictors X1-X12 and the target Y are all float64
df_reg.dtypes
Out[19]:
X1 float64 X2 float64 X3 float64 X4 float64 X5 float64 X6 float64 X7 float64 X8 float64 X9 float64 X10 float64 X11 float64 X12 float64 Y float64 dtype: object
In [20]:
# First rows, to eyeball value ranges and column layout
df_reg.head()
Out[20]:
| X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | X11 | X12 | Y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5468.737967 | 1.703822 | -8613.627601 | 0.284719 | -1.585462 | -0.554724 | -1.023031 | -0.632593 | -73183.99886 | -0.119020 | -0.801916 | 0.189757 | -126.781682 |
| 1 | -13104.449210 | -1.019682 | 23061.174510 | -0.820498 | -0.265240 | 2.099421 | -0.316338 | -0.302272 | -327842.82340 | -0.315072 | -1.925923 | -0.687342 | -166.589518 |
| 2 | 6204.272157 | 0.734018 | 133965.765100 | -0.710429 | 0.574379 | 0.511245 | 0.298126 | 0.121694 | 120237.04040 | 0.892858 | -0.474257 | 0.172296 | 171.818818 |
| 3 | 23979.672370 | -1.777481 | 45514.777250 | -0.148987 | -0.006399 | 0.464724 | 0.356851 | 0.770954 | -121603.94100 | 0.507455 | 0.673496 | 1.116824 | 111.490780 |
| 4 | -2390.952073 | -2.418137 | 47102.194950 | 0.384587 | 0.359033 | 0.178320 | -1.063040 | 0.995341 | 139065.42410 | 1.572357 | 0.334709 | -0.403581 | 138.329869 |
In [21]:
# DataFrame summary: non-null counts per column, dtypes and memory usage
df_reg.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1500 entries, 0 to 1499 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 X1 1499 non-null float64 1 X2 1500 non-null float64 2 X3 1495 non-null float64 3 X4 1497 non-null float64 4 X5 1495 non-null float64 5 X6 1498 non-null float64 6 X7 1499 non-null float64 7 X8 1498 non-null float64 8 X9 1500 non-null float64 9 X10 1496 non-null float64 10 X11 1498 non-null float64 11 X12 1499 non-null float64 12 Y 1499 non-null float64 dtypes: float64(13) memory usage: 152.5 KB
In [22]:
# Descriptive statistics (count, mean, std, quartiles) for all numeric columns
df_reg.describe()
Out[22]:
| X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | X11 | X12 | Y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1499.000000 | 1500.000000 | 1495.000000 | 1497.000000 | 1495.000000 | 1498.000000 | 1499.000000 | 1498.000000 | 1500.000000 | 1496.000000 | 1498.000000 | 1499.000000 | 1499.000000 |
| mean | -298.970046 | -0.000182 | 2505.655649 | 0.019644 | -0.037554 | -0.012483 | -0.003856 | 0.028775 | 596.758217 | 0.012900 | -0.040321 | 0.010624 | -2.976044 |
| std | 12664.656465 | 1.036299 | 95364.185732 | 1.017798 | 0.975190 | 1.016689 | 0.964161 | 0.993368 | 233211.825271 | 0.999814 | 0.997520 | 0.978725 | 143.364341 |
| min | -39838.521840 | -3.710679 | -336700.253400 | -2.796599 | -3.282985 | -3.922695 | -3.147417 | -4.011049 | -835370.363300 | -3.126188 | -2.937315 | -2.716798 | -519.074640 |
| 25% | -9433.010398 | -0.677534 | -60485.305915 | -0.694018 | -0.698821 | -0.683112 | -0.665480 | -0.640609 | -157687.903625 | -0.686014 | -0.692837 | -0.678441 | -99.192382 |
| 50% | -496.180129 | 0.021941 | 1748.071119 | 0.027749 | -0.034482 | -0.041862 | -0.044291 | 0.027252 | -1684.769781 | -0.013111 | -0.046213 | 0.031184 | -1.531002 |
| 75% | 8318.518644 | 0.705290 | 70803.472755 | 0.728685 | 0.615946 | 0.675086 | 0.613501 | 0.727090 | 163646.360200 | 0.701604 | 0.604628 | 0.676063 | 92.533762 |
| max | 44114.014510 | 3.156200 | 312030.703900 | 3.344649 | 3.515152 | 3.038999 | 3.041686 | 3.508867 | 854211.627100 | 3.218559 | 3.808566 | 3.529275 | 482.853143 |
In [23]:
# Per-column missing-value summary: absolute count and percentage of rows
missing_summary = df_reg.isnull().sum().rename("n_missing").to_frame()
missing_summary["pct_missing"] = missing_summary["n_missing"] / len(df_reg) * 100
print("\nResumen de valores faltantes:")
missing_summary
Resumen de valores faltantes:
Out[23]:
| n_missing | pct_missing | |
|---|---|---|
| X1 | 1 | 0.066667 |
| X2 | 0 | 0.000000 |
| X3 | 5 | 0.333333 |
| X4 | 3 | 0.200000 |
| X5 | 5 | 0.333333 |
| X6 | 2 | 0.133333 |
| X7 | 1 | 0.066667 |
| X8 | 2 | 0.133333 |
| X9 | 0 | 0.000000 |
| X10 | 4 | 0.266667 |
| X11 | 2 | 0.133333 |
| X12 | 1 | 0.066667 |
| Y | 1 | 0.066667 |
In [24]:
# Row/column count before dropping rows with missing values
df_reg.shape
Out[24]:
(1500, 13)
In [25]:
# Drop every row with at least one missing value (1500 -> 1473 rows).
# NOTE(review): this overwrites df_reg in place, so the raw frame is only
# recoverable by re-running the load cell.
df_reg = df_reg.dropna()
In [26]:
# Row/column count after dropping rows with missing values
df_reg.shape
Out[26]:
(1473, 13)
In [27]:
# Univariate distributions of every numeric variable (one histogram per column)
num_cols_reg = df_reg.select_dtypes(include=[np.number]).columns
df_reg.loc[:, num_cols_reg].hist(bins=20, figsize=(10, 8))
plt.tight_layout()
plt.show()
In [28]:
# Predictor columns (everything except the target Y)
numeric_cols_reg = num_cols_reg.drop("Y")

# One scatter-plus-linear-fit panel per predictor, stacked vertically.
# (Removed: an unused `from pandas.plotting import scatter_matrix` import, an
# unused `reg_vals` variable, and redundant re-imports of plt/sns already in
# scope from earlier cells.)
n_feats = len(numeric_cols_reg)
fig, axes = plt.subplots(n_feats, 1, figsize=(7, 4 * n_feats))
for ax, feat in zip(axes, numeric_cols_reg):
    sns.regplot(
        x=feat,
        y="Y",
        data=df_reg,
        ax=ax,
        scatter_kws={"alpha": 0.5, "s": 20},
        line_kws={"color": "red"},
    )
    ax.set_title(f"{feat} vs Y")
    ax.set_xlabel(feat)
    ax.set_ylabel("Y")
plt.tight_layout()
plt.show()
In [29]:
# Correlation heatmap of predictors plus the target Y.
# (Removed redundant re-imports: np and plt come from the first cell, sns from
# earlier plotting cells. This cell mirrors the classification heatmap.)
corr = df_reg[numeric_cols_reg.tolist() + ["Y"]].corr()
sns.set_theme(style="white")

# Mask the upper triangle so every pair is annotated only once
mask = np.triu(np.ones_like(corr, dtype=bool))

# Scale the figure with the number of variables so cells stay readable
n = corr.shape[0]
fig, ax = plt.subplots(figsize=(1.2 * n, 1.2 * n))
sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap="RdBu_r",
    center=0,
    linewidths=1.0,         # thicker separators between cells
    square=True,            # square cells
    annot_kws={"size": 8},  # smaller annotation text
    cbar_kws={"shrink": 0.6, "label": "Correlación"},
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=10)
ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=10)
plt.subplots_adjust(bottom=0.3, left=0.2, top=0.9)
ax.set_title("Mapa de Calor de Correlaciones", fontsize=18, pad=20)
plt.show()
Comments¶
Classification¶
- El dataset tiene: 1.500 observaciones y 16 columnas (15 numéricas y 1 categórica de clase Y). Las variables X1–X15 son float y Y es int con valores {0,1,2}
- Menos del 1% de datos faltantes en 9 de las 15 variables (máx. 3 filas en X2/X4), afectando en total a 15 filas; tras eliminar nulos quedan 1.485 filas.
- Percentiles 25, 50 y 75 en 0,1,2 nos indican clases equilibradas (~500 ejemplos por clase).
- Medias centradas cerca de cero, desviaciones estándar en rango 3–8.
- Casi todas las Xi presentan tres picos (formas multimodales) bien definidos en sus histogramas, lo que podría corresponder a agrupamientos alineados con las tres clases.
- Variables como X1, X13 o X14 muestran 3 grupos marcados.
- Los scatterplots de Y vs X1, X8, X13 nos confirman agrupamientos clasificables por rangos de cada variable.
- En el pairplot, la mayoría de pares de variables muestran tres nubes separadas.
- En el mapa de calor de correlaciones se aprecian múltiples pares con valores muy fuertes $|ρ|>0.9$. La variable Y se correlaciona positivamente con X6 (+0.96), X7 (+0.93) y negativamente con X13 (–0.98).
- Multicolinealidad: Alta redundancia entre características (varias parejas >0.9), se podrían usar técnicas de selección o reducción de dimensión para evitar sobreajuste.
Regression¶
- El dataset contiene 1 500 observaciones y 13 columnas — 12 predictoras (X1-X12, float) y la variable objetivo Y (float).
- Menos del 2% de datos faltantes, repartidos entre varias Xi y la propia Y; tras eliminar nulos se conservan 1 473 filas (pérdida ≈ 1,8%).
- Varias de las variables parecieran seguir distribuciones normales centradas en 0.
- X1, X3, X9 presentan rangos mucho más grandes y colas pesadas.
- Desviaciones estándar distintas y distantes en X1 (~ 12 665), X3 (~ 95 364) y X9 (~ 233 212) frente a ≈ 1 en el resto, podría considerarse escalado para aquellos modelos en los que impacte esto.
- No existe correlación con Y, la mayoría de Xi muestra $ρ≈ 0$.
- Los puntos del scatter plot no siguen un patrón claro y están muy dispersos alrededor de la línea roja (recta de regresión).
- Multicolinealidad mínima: coeficientes Xi-Xj muy bajos (|ρ| < 0,06), por lo que las variables son prácticamente independientes.
In [ ]: